{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 人工智能范式（AI Paradigm）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 目录\n",
    "1. [Rule Based](#Rule-Based)\n",
    "   1. [基于语法规则](#基于语法规则)         \n",
    "   2. [基于模式或模板](#基于模式或模板)\n",
    "2. [Probability Based](#Probability-Based)   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### [Rule Based](#目录)\n",
    "- 基于预先定义的规则或模式，如对话生成中，根据预先定义好的语法来生成合理的语句\n",
    "- 当规则(语法)发生了改变，整个系统可能需要重写"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 简单的语法规则\n",
    "\n",
    "simple_grammar = \"\"\"\n",
    "sentence => noun_phrase verb_phrase\n",
    "noun_phrase => Article Adj* noun\n",
    "Adj* => null | Adj Adj*\n",
    "verb_phrase => verb noun_phrase\n",
    "Article =>  一个 | 这个\n",
    "noun =>   女人 |  篮球 | 桌子 | 小猫\n",
    "verb => 看着   |  坐在 |  听着 | 看见\n",
    "Adj =>  蓝色的 | 好看的 | 小小的\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('蓝色的', '蓝色的')"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 生成单个形容词\n",
    "\n",
    "\n",
    "def adj():\n",
    "    return random.choice(\"蓝色的 | 好看的 | 小小的\".split(\"|\")).split()[0]\n",
    "\n",
    "\n",
    "adj(), adj()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "ename": "RecursionError",
     "evalue": "maximum recursion depth exceeded while calling a Python object",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mRecursionError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-33-4fdab7849bba>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0madj_star\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;31m# choice 需要先执行完被选择列表中的内容再选择，所以出现无限递归的问题\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-33-4fdab7849bba>\u001b[0m in \u001b[0;36madj_star\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0madj_star\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchoice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0madj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0madj_star\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "... last 1 frames repeated, from the frame below ...\n",
      "\u001b[0;32m<ipython-input-33-4fdab7849bba>\u001b[0m in \u001b[0;36madj_star\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0madj_star\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchoice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0madj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0madj_star\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mRecursionError\u001b[0m: maximum recursion depth exceeded while calling a Python object"
     ]
    }
   ],
   "source": [
    "# 生成零个或多个形容词\n",
    "\n",
    "\n",
    "def adj_star():\n",
    "    return random.choice([None, adj() + adj_star()])\n",
    "\n",
    "\n",
    "adj_star()\n",
    "\n",
    "# choice 需要先执行完被选择列表中的内容再选择，所以出现无限递归的问题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('', '小小的')"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 生成零个或多个形容词\n",
    "\n",
    "\n",
    "def adj_star():\n",
    "    return random.choice([lambda: '', lambda: adj() + adj_star()])()\n",
    "\n",
    "\n",
    "adj_star(), adj_star()\n",
    "# 被选择的列表为匿名函数，不会立即执行"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### [基于语法规则](#目录)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将预定义语法字符串转化成 python 格式\n",
    "\n",
    "\n",
    "def create_grammar(grammar_str, split='=>', line_split='\\n'):\n",
    "    grammar = {}\n",
    "    for line in grammar_str.split(line_split):\n",
    "        if not line.strip():\n",
    "            continue\n",
    "        exp, stmt = line.split(split)\n",
    "        grammar[exp.strip()] = [s.split() for s in stmt.split('|')]\n",
    "    return grammar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Adj*': [['null'], ['Adj', 'Adj*']], 'Adj': [['蓝色的'], ['好看的'], ['小小的']]}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 生成语法\n",
    "\n",
    "adj_grammar = \"\"\"\n",
    "Adj* => null | Adj Adj*\n",
    "Adj =>  蓝色的 | 好看的 | 小小的\n",
    "\"\"\"\n",
    "\n",
    "grammar = create_grammar(adj_grammar)\n",
    "grammar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 根据语法生成语句\n",
    "\n",
    "from random import choice\n",
    "\n",
    "\n",
    "def generate(gram, target):\n",
    "    if target not in gram:\n",
    "        return target\n",
    "    expanded = [generate(gram, t) for t in choice(gram[target])]\n",
    "    return ''.join(e for e in expanded if e != 'null')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'sentence': [['noun_phrase', 'verb_phrase']],\n",
       " 'noun_phrase': [['Article', 'Adj*', 'noun']],\n",
       " 'Adj*': [['null'], ['Adj', 'Adj*']],\n",
       " 'verb_phrase': [['verb', 'noun_phrase']],\n",
       " 'Article': [['一个'], ['这个']],\n",
       " 'noun': [['女人'], ['篮球'], ['桌子'], ['小猫']],\n",
       " 'verb': [['看着'], ['坐在'], ['听着'], ['看见']],\n",
       " 'Adj': [['蓝色的'], ['好看的'], ['小小的']]}"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_grammar = create_grammar(simple_grammar)\n",
    "example_grammar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'这个小小的女人看见一个好看的小猫'"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(gram=example_grammar, target='sentence')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1. 模拟人类和机器人对话："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 在西部世界里，一个”人类“的语言可以定义为：\n",
    "human = \"\"\"\n",
    "human = 自己 寻找 活动\n",
    "自己 = 我 | 俺 | 我们 \n",
    "寻找 = 找找 | 想找点 \n",
    "活动 = 乐子 | 玩的\n",
    "\"\"\"\n",
    "\n",
    "# 一个“接待员”的语言可以定义为\n",
    "host = \"\"\"\n",
    "host = 寒暄 报数 询问 业务相关 结尾 \n",
    "报数 = 我是 数字 号 ,\n",
    "数字 = 单个数字 | 数字 单个数字 \n",
    "单个数字 = 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 \n",
    "寒暄 = 称谓 打招呼 | 打招呼\n",
    "称谓 = 人称 ,\n",
    "人称 = 先生 | 女士 | 小朋友\n",
    "打招呼 = 你好, | 您好,\n",
    "询问 = 请问你要 | 您需要\n",
    "业务相关 = 玩玩 具体业务\n",
    "玩玩 = null\n",
    "具体业务 = 喝酒 | 打牌 | 打猎 | 赌博\n",
    "结尾 = 吗？\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "先生,您好,我是652号,请问你要打猎吗？\n",
      "我们找找乐子\n",
      "女士,你好,我是9号,您需要打牌吗？\n",
      "我找找玩的\n",
      "你好,我是36号,请问你要喝酒吗？\n",
      "我想找点乐子\n",
      "你好,我是77号,请问你要打牌吗？\n",
      "俺找找玩的\n",
      "女士,您好,我是31号,请问你要赌博吗？\n",
      "我们想找点乐子\n"
     ]
    }
   ],
   "source": [
    "for i in range(5):\n",
    "    print(generate(gram=create_grammar(host, split='='), target='host'))\n",
    "    print(generate(gram=create_grammar(human, split='='), target='human'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2. 客服机器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "service_robot = \"\"\"\n",
    "ask = 打招呼 , 自我介绍 , 询问 \n",
    "\n",
    "打招呼 = 称谓 , 招呼 \n",
    "称谓 = 亲 | 先生 | 女士 | 小朋友 | 美女| 帅哥\n",
    "人称 = 你 | 您 \n",
    "招呼 = 人称 好\n",
    "\n",
    "自我介绍 = 我是 数字 号客服|我是客服 名称\n",
    "数字 = 单个数字 | 数字 单个数字 \n",
    "单个数字 = 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9\n",
    "名称 = 小芳 | 小静 | 张三 | 王五\n",
    "\n",
    "询问 = 人称 动作 结尾\n",
    "动作 = 想知道 业务+ | 想了解 业务+ |想咨询 业务+ | 对 业务+ 感兴趣\n",
    "业务+ = 业务 | 业务 和 业务+ \n",
    "业务 = 产品详情 | 购买须知 | 优惠券 | 使用方法 | 订单查询 | 产品维修 | 退货流程 | 退款详情 | 人工客服 \n",
    "结尾 = 吗？ \n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_grammar(grammar_str, split='=', line_split='\\n'):\n",
    "    grammar = {}\n",
    "    for line in grammar_str.split(line_split):\n",
    "        if not line.strip():\n",
    "            continue\n",
    "        exp, stmt = line.split(split)\n",
    "        grammar[exp.strip()] = [s.split() for s in stmt.split('|')]\n",
    "    return grammar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'ask': [['打招呼', ',', '自我介绍', ',', '询问']],\n",
       " '打招呼': [['称谓', ',', '招呼']],\n",
       " '称谓': [['亲'], ['先生'], ['女士'], ['小朋友'], ['美女'], ['帅哥']],\n",
       " '人称': [['你'], ['您']],\n",
       " '招呼': [['人称', '好']],\n",
       " '自我介绍': [['我是', '数字', '号客服'], ['我是客服', '名称']],\n",
       " '数字': [['单个数字'], ['数字', '单个数字']],\n",
       " '单个数字': [['1'], ['2'], ['3'], ['4'], ['5'], ['6'], ['7'], ['8'], ['9']],\n",
       " '名称': [['小芳'], ['小静'], ['张三'], ['王五']],\n",
       " '询问': [['人称', '动作', '结尾']],\n",
       " '动作': [['想知道', '业务+'], ['想了解', '业务+'], ['想咨询', '业务+'], ['对', '业务+', '感兴趣']],\n",
       " '业务+': [['业务'], ['业务', '和', '业务+']],\n",
       " '业务': [['产品详情'],\n",
       "  ['购买须知'],\n",
       "  ['优惠券'],\n",
       "  ['使用方法'],\n",
       "  ['订单查询'],\n",
       "  ['产品维修'],\n",
       "  ['退货流程'],\n",
       "  ['退款详情'],\n",
       "  ['人工客服']],\n",
       " '结尾': [['吗？']]}"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_gram = create_grammar(service_robot)\n",
    "example_gram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "choice = random.choice\n",
    "\n",
    "\n",
    "def generate(gram, target):\n",
    "    if target not in gram: return target\n",
    "    expaned = [generate(gram, t) for t in choice(gram[target])]\n",
    "    return \"\".join([e if e != '/n' else '\\n' for e in expaned if e != 'null'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'帅哥,您好,我是9155号客服,你想咨询优惠券和产品维修吗？'"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(gram=example_gram, target='ask')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['小朋友,您好,我是客服小芳,您想知道退货流程和退货流程和产品维修和使用方法和产品详情吗？',\n",
       " '帅哥,你好,我是客服张三,您想咨询优惠券吗？',\n",
       " '小朋友,您好,我是4号客服,你想知道使用方法和订单查询和人工客服吗？']"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 生成多个句子\n",
    "\n",
    "\n",
    "def generate_n(n, gram, target):\n",
    "    return [generate(gram, target) for _ in range(n)]\n",
    "\n",
    "\n",
    "generate_n(3, gram=example_gram, target='ask')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### [基于模式或模板](#目录)\n",
    "> Pattern: (我想要A)     \n",
    "Response: (如果你有 A，对你意味着什么呢？)  \n",
    "\n",
    ">Input: (我想要度假)    \n",
    "Response: (如果你有度假，对你意味着什么呢？)\n",
    "\n",
    "为了实现模板的判断和定义，定义特殊的符号，来表示是一个占位符。例如， \"I want ?X\", 意思就是 ?X 是一个用来占位的符号\n",
    "如果输入了\"I want holiday\"， 在这里 'holiday' 就是 '?X'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(True, False)"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 判断是否为变量：变量定义为以 ？开头 的单词\n",
    "\n",
    "def is_variable(pat):\n",
    "    return pat.startswith('?') and all(s.isalpha() for s in pat[1:])\n",
    "\n",
    "is_variable('?hello'), is_variable('?hello world')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 判断句子与模板是否匹配，模板中包含上述的变量\n",
    "\n",
    "\n",
    "def pat_match_1(pattern, sent):\n",
    "    if is_variable(pattern[0]):\n",
    "        return True\n",
    "    else:\n",
    "        if pattern[0] != sent[0]:\n",
    "            return False\n",
    "        else:\n",
    "            return pat_match_1(pattern[1:], sent[1:])\n",
    "        \n",
    "        \n",
    "pat_match_1(\"I want ?x\", \"I want holiday\")        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pat_match_1(\"I have dreamed a ?x\", \"I dreamed about dog\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('?X', 'holiday')"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 判断句子与模板是否匹配，获得匹配的变量\n",
    "\n",
    "\n",
    "def pat_match_2(pattern, sent):\n",
    "    if is_variable(pattern[0]):\n",
    "        return pattern[0], sent[0]\n",
    "    else:\n",
    "        if pattern[0] != sent[0]: return False\n",
    "        else:\n",
    "            return pat_match_2(pattern[1:], sent[1:])\n",
    "\n",
    "pattern = 'I want ?X'.split()\n",
    "sent = \"I want holiday\".split()\n",
    "pat_match_2(pattern, sent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('?X', '3'), ('?Y', '2')]"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# \"I need iPhone\" 和 \"I need ?X\" 可以匹配，\n",
    "# 但是\"I need an iPhone\" 和 \"I need ?X\" 就不匹配\n",
    "# 新建一个变量类型 \"?*X\", 多了一个星号(*),表示匹配多个\n",
    "# 多个部分匹配\n",
    "\n",
    "\n",
    "def pat_match(pattern, saying):\n",
    "    if not pattern or not saying:\n",
    "        return []\n",
    "    if is_variable(pattern[0]):\n",
    "        return [(pattern[0], saying[0])] + pat_match(pattern[1:], saying[1:])\n",
    "    else:\n",
    "        if pattern[0] != saying[0]:\n",
    "            return []\n",
    "        else:\n",
    "            return pat_match(pattern[1:], saying[1:])\n",
    "\n",
    "\n",
    "pat_match(\"?X greater than ?Y\".split(), \"3 greater than 2\".split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将模板中的变量替换成单词，生成句子\n",
    "\n",
    "\n",
    "def pat_to_dict(patterns):\n",
    "    return {k: v for k, v in patterns}\n",
    "\n",
    "\n",
    "def subsitite(rule, parsed_rules):\n",
    "    if not rule:\n",
    "        return []\n",
    "    return [parsed_rules.get(rule[0], rule[0])] + subsitite(\n",
    "        rule[1:], parsed_rules)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('?X', 'iPhone')]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "got_patterns = pat_match(\"I want ?X\".split(), \"I want iPhone\".split())\n",
    "got_patterns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'What if you mean if you got a iPhone'"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "' '.join(\n",
    "    subsitite(\"What if you mean if you got a ?X\".split(),\n",
    "              pat_to_dict(got_patterns)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('?P', 'John'), ('?X', 'vacation')]"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "john_pat = pat_match('?P needs ?X'.split(), \"John needs vacation\".split())\n",
    "john_pat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Why does John need vacation ?'"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "' '.join(subsitite(\"Why does ?P need ?X ?\".split(), pat_to_dict(john_pat)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 基于模板生成对话"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "defined_patterns = {\n",
    "    \"I need ?X\": [\"Image you will get ?X soon\", \"Why do you need ?X ?\"],\n",
    "    \"My ?X told me something\":\n",
    "    [\"Talk about more about your ?X\", \"How do you think about your ?X ?\"]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Image you will get iPhone soon', 'Talk about more about your mother')"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "\n",
    "def get_response(saying, rules=defined_patterns):\n",
    "    \"\"\"\" \n",
    "    >>> get_response('I need iPhone') \n",
    "    >>> Image you will get iPhone soon\n",
    "    >>> get_response(\"My mother told me something\")\n",
    "    >>> Talk about more about your monther.\n",
    "    \"\"\"\n",
    "    for pat, response_pats in rules.items():\n",
    "        match = pat_match(pat.split(), saying.split())\n",
    "        if match:\n",
    "            match_dic = pat_to_dict(match)\n",
    "            response_pat = random.choice(response_pats)\n",
    "            return ' '.join(subsitite(response_pat.split(), match_dic))\n",
    "\n",
    "\n",
    "get_response('I need iPhone'), get_response(\"My mother told me something\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 上述匹配规则太死板，必须逐字逐句匹配，实现更宽泛的匹配\n",
    "\n",
    "\n",
    "def is_pattern_segment(pattern):\n",
    "    return pattern.startswith('?*') and all(a.isalpha() for a in pattern[2:])\n",
    "\n",
    "\n",
    "is_pattern_segment('?*Pos')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(('?P', ['mike', 'jack']), 2)"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 判断是否匹配，并获取匹配的单词\n",
    "from collections import defaultdict\n",
    "\n",
    "\n",
    "def segment_match(pattern, saying):\n",
    "    \"\"\"\n",
    "    pattern: pattern[0] 必须以 '?*' 开始, eg:'?*P'\n",
    "    \"\"\"\n",
    "    seg_pat, rest = pattern[0], pattern[1:]\n",
    "    seg_pat = seg_pat.replace('?*', '?')\n",
    "    if not rest:\n",
    "        return (seg_pat, saying), len(saying)\n",
    "    for i, token in enumerate(saying):\n",
    "        if rest[0] == token and is_match(rest[1:], saying[(i + 1):]):\n",
    "            return (seg_pat, saying[:i]), i\n",
    "    return (seg_pat, saying), len(saying)\n",
    "\n",
    "\n",
    "def is_match(rest, saying):\n",
    "    if not rest and not saying:\n",
    "        return True\n",
    "    if not all(a.isalpha() for a in rest[0]):\n",
    "        return True\n",
    "    if rest[0] != saying[0]:\n",
    "        return False\n",
    "    return is_match(rest[1:], saying[1:])\n",
    "\n",
    "\n",
    "segment_match(['?*P', 'hello', 'world'], ['mike', 'jack', 'hello', 'world'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 多位匹配，获取所有匹配的单词\n",
    "\n",
    "fail = [True, None]\n",
    "\n",
    "\n",
    "def pat_match_with_seg(pattern, saying):\n",
    "    if not pattern or not saying:\n",
    "        return []\n",
    "    pat = pattern[0]\n",
    "    if is_variable(pat):\n",
    "        return [(pat, saying[0])] + pat_match_with_seg[pattern[1:], saying[1:]]\n",
    "    elif is_pattern_segment(pat):\n",
    "        match, index = segment_match(pattern, saying)\n",
    "        return [match] + pat_match_with_seg(pattern[1:], saying[index:])\n",
    "    elif pat == saying[0]:\n",
    "        return pat_match_with_seg(pattern[1:], saying[1:])\n",
    "    else:\n",
    "        return fail"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('?PL', ['dog', 'and', 'my', 'cat']), ('?L', ['and', 'very', 'friendly'])]"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pat_match_with_seg('My ?*PL are very good ?*L'.split(),\n",
    "                   \"My dog and my cat are very good and very friendly\".split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "response_pair = {\n",
    "    'I need ?X': [\"Why do you neeed ?X\"],\n",
    "    \"I dont like my ?X\": [\"What bad things did ?X do for you?\"]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('?X', ['I', 'am', 'mike,'])]"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pat_match_with_seg('?*X hello ?*Y'.split(), \"I am mike, hello \".split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('?X', ['I']), ('?P', ['an', 'iPhone'])]"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pat_match_with_seg('?*X need ?*P too'.split(), \"I need an iPhone\".split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Why', 'do', 'you', 'neeed', ['an', 'iPhone']]"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 生成句子\n",
    "subsitite(\n",
    "    \"Why do you neeed ?X\".split(),\n",
    "    pat_to_dict(\n",
    "        pat_match_with_seg('I need ?*X'.split(), \"I need an iPhone\".split())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pat_to_dict(patterns):\n",
    "    return {k: ' '.join(v) if isinstance(v, list) else v for k, v in patterns}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Why', 'do', 'you', 'neeed', 'an iPhone']"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subsitite(\n",
    "    \"Why do you neeed ?X\".split(),\n",
    "    pat_to_dict(\n",
    "        pat_match_with_seg('I need ?*X'.split(), \"I need an iPhone\".split())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">>> hello hello world!\n",
      "['How do you do', 'Why do you want ?y', 'What do you think about ?y', 'You are being a negative', 'Why do you tell me you were ?y now?', 'What other feelings do you have?']\n",
      "\n",
      ">>> Now I want to fly.\n",
      "['Please state your problem', 'Suppose you got to fly. soon', 'Do you really think its likely that ?y', 'why not?', 'Why do you tell me you were ?y now?', 'What other feelings do you have?']\n",
      "\n",
      ">>> I always konw what I want to do.\n",
      "['How do you do', 'Suppose you got to do. soon', 'Really-- if ?y', \"Are you saying 'No' just to be negative?\", 'Why do you tell me you were ?y now?', 'Do you often feel ?y ?']\n",
      "\n",
      ">>> After the accident I was  so depressed.\n",
      "['How do you do', 'Suppose you got ?y soon', 'Do you really think its likely that ?y', \"Are you saying 'No' just to be negative?\", 'Perhaps I already knew you were so depressed.', 'Do you often feel ?y ?']\n",
      "\n",
      ">>> After the vacation I feel  so good.\n",
      "['How do you do', 'Why do you want ?y', 'Do you really think its likely that ?y', 'why not?', 'Perhaps I already knew you were ?y', 'Do you often feel so good. ?']\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 生成对话\n",
    "\n",
    "import random\n",
    "\n",
    "\n",
    "def get_response(saying, response_rules):\n",
    "    responses = []\n",
    "    for pat, response_pats in response_rules.items():\n",
    "        match = pat_match_with_seg(pat.split(), saying.split())\n",
    "        if match:\n",
    "            match_dic = pat_to_dict(match)\n",
    "            response_pat = random.choice(response_pats)\n",
    "            responses.append(' '.join(\n",
    "                subsitite(response_pat.split(), match_dic)))\n",
    "    return responses\n",
    "\n",
    "\n",
    "rule_responses = {\n",
    "    '?*x hello ?*y': ['How do you do', 'Please state your problem'],\n",
    "    '?*x I want ?*y': [\n",
    "        'what would it mean if you got ?y', 'Why do you want ?y',\n",
    "        'Suppose you got ?y soon'\n",
    "    ],\n",
    "    '?*x if ?*y': [\n",
    "        'Do you really think its likely that ?y', 'Do you wish that ?y',\n",
    "        'What do you think about ?y', 'Really-- if ?y'\n",
    "    ],\n",
    "    '?*x no ?*y': [\n",
    "        'why not?', 'You are being a negative',\n",
    "        'Are you saying \\'No\\' just to be negative?'\n",
    "    ],\n",
    "    '?*x I was ?*y': [\n",
    "        'Were you really', 'Perhaps I already knew you were ?y',\n",
    "        'Why do you tell me you were ?y now?'\n",
    "    ],\n",
    "    '?*x I feel ?*y':\n",
    "    ['Do you often feel ?y ?', 'What other feelings do you have?']\n",
    "}\n",
    "\n",
    "sayings = \"\"\"\n",
    "hello hello world!\n",
    "Now I want to fly.\n",
    "I always konw what I want to do.\n",
    "After the accident I was  so depressed.\n",
    "After the vacation I feel  so good.\n",
    "\"\"\"\n",
    "\n",
    "for saying in sayings.strip().split('\\n'):\n",
    "    print('>>> ' + saying)\n",
    "    print(get_response(saying, response_rules=rule_responses))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### [Probability Based](#目录)\n",
    "- 基于规则的模型，太复杂，性能较差\n",
    "- 基于概率的语言模型：\n",
两句话：今天早上">
    ">两句话：“今天早上吃晚饭的时候”和“今天早上吃早饭的时候”，后者在真实语境中出现的**概率**更高\n",
    "- $sent=w_1 w_2 w_3 w_4$，将句子拆解成单词，句子出现的概率可以转换成单词出现的概率：\n",
$ P(w_1 w_2">
    ">$P(w_1 w_2 w_3 w_4) = P(w_1 | w_2 w_3 w_4) * P(w_2 | w_3 w_4) * P(w_3 | w_4) * P(w_4)$  \n",
    "$P(w_1 w_2 w_3 w_4) \\approx P(w_1 | w_2) * P(w_2 | w_3) * P(w_3 | w_4) * P(w_4)$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练语言模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import random\n",
    "from collections import Counter\n",
    "from functools import reduce\n",
    "import operator\n",
    "import pandas as pd\n",
    "import jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>author</th>\n",
       "      <th>source</th>\n",
       "      <th>content</th>\n",
       "      <th>feature</th>\n",
       "      <th>title</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>89617</td>\n",
       "      <td>NaN</td>\n",
       "      <td>快科技@http://www.kkj.cn/</td>\n",
       "      <td>此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...</td>\n",
       "      <td>{\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"37\"...</td>\n",
       "      <td>小米MIUI 9首批机型曝光：共计15款</td>\n",
       "      <td>http://www.cnbeta.com/articles/tech/623597.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>89616</td>\n",
       "      <td>NaN</td>\n",
       "      <td>快科技@http://www.kkj.cn/</td>\n",
       "      <td>骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...</td>\n",
       "      <td>{\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"15\"...</td>\n",
       "      <td>骁龙835在Windows 10上的性能表现有望改善</td>\n",
       "      <td>http://www.cnbeta.com/articles/tech/623599.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>89615</td>\n",
       "      <td>NaN</td>\n",
       "      <td>快科技@http://www.kkj.cn/</td>\n",
       "      <td>此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\\r\\n...</td>\n",
       "      <td>{\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"18\"...</td>\n",
       "      <td>一加手机5细节曝光：3300mAh、充半小时用1天</td>\n",
       "      <td>http://www.cnbeta.com/articles/tech/623601.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>89614</td>\n",
       "      <td>NaN</td>\n",
       "      <td>新华社</td>\n",
       "      <td>这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\\r\\n</td>\n",
       "      <td>{\"type\":\"国际新闻\",\"site\":\"环球\",\"commentNum\":\"0\",\"j...</td>\n",
       "      <td>葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）</td>\n",
       "      <td>http://world.huanqiu.com/hot/2017-06/10866126....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>89613</td>\n",
       "      <td>胡淑丽_MN7479</td>\n",
       "      <td>深圳大件事</td>\n",
       "      <td>（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\\r\\n@深圳交警微博称：昨日清...</td>\n",
       "      <td>{\"type\":\"新闻\",\"site\":\"网易热门\",\"commentNum\":\"978\",...</td>\n",
       "      <td>44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随</td>\n",
       "      <td>http://news.163.com/17/0618/00/CN617P3Q0001875...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id      author                  source  \\\n",
       "0  89617         NaN  快科技@http://www.kkj.cn/   \n",
       "1  89616         NaN  快科技@http://www.kkj.cn/   \n",
       "2  89615         NaN  快科技@http://www.kkj.cn/   \n",
       "3  89614         NaN                     新华社   \n",
       "4  89613  胡淑丽_MN7479                   深圳大件事   \n",
       "\n",
       "                                             content  \\\n",
       "0  此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...   \n",
       "1  骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...   \n",
       "2  此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\\r\\n...   \n",
       "3    这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\\r\\n   \n",
       "4  （原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\\r\\n@深圳交警微博称：昨日清...   \n",
       "\n",
       "                                             feature  \\\n",
       "0  {\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"37\"...   \n",
       "1  {\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"15\"...   \n",
       "2  {\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"18\"...   \n",
       "3  {\"type\":\"国际新闻\",\"site\":\"环球\",\"commentNum\":\"0\",\"j...   \n",
       "4  {\"type\":\"新闻\",\"site\":\"网易热门\",\"commentNum\":\"978\",...   \n",
       "\n",
       "                           title  \\\n",
       "0           小米MIUI 9首批机型曝光：共计15款   \n",
       "1     骁龙835在Windows 10上的性能表现有望改善   \n",
       "2      一加手机5细节曝光：3300mAh、充半小时用1天   \n",
       "3  葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）   \n",
       "4       44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随   \n",
       "\n",
       "                                                 url  \n",
       "0     http://www.cnbeta.com/articles/tech/623597.htm  \n",
       "1     http://www.cnbeta.com/articles/tech/623599.htm  \n",
       "2     http://www.cnbeta.com/articles/tech/623601.htm  \n",
       "3  http://world.huanqiu.com/hot/2017-06/10866126....  \n",
       "4  http://news.163.com/17/0618/00/CN617P3Q0001875...  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 原始语料：新闻数据\n",
    "\n",
    "file_path = 'datasets/sqlResult_1558435.csv'\n",
    "content = pd.read_csv(file_path, encoding='utf-8')\n",
    "content.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 文本处理，删除非文字\n",
    "\n",
    "articles = content['content'].tolist()\n",
    "\n",
    "\n",
    "def token(string):\n",
    "    return re.findall('\\w+', string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['在外国名著',\n",
       " '麦田里的守望者',\n",
       " '中',\n",
       " '作者想要守护麦田里如自己内心一般纯真的孩子们',\n",
       " '而驻村干部们',\n",
       " '也在这个炎热的夏天里撸袖子上阵',\n",
       " '真正做起了村民们的',\n",
       " '麦田守望者',\n",
       " '三夏时节不等人',\n",
       " '你看到了吗',\n",
       " '不停翻涌起伏',\n",
       " '仿若铺陈至天边的金黄麦浪中',\n",
       " '那若隐若现的人影',\n",
       " '是自治区新闻出版广电局驻和田市肖尔巴格乡合尼村工作队的队员与工作队组织的青年志愿者',\n",
       " '在这个炎热的夏季',\n",
       " '他们深入田间地头',\n",
       " '帮助村民们收割小麦',\n",
       " '扛起收麦机',\n",
       " '麦田中的每个人都显得兴致勃勃',\n",
       " '一天下来就近22亩小麦收割完毕',\n",
       " '志愿者麦麦提亚森擦去满脸的汗水',\n",
       " '高兴地告诉驻村队员',\n",
       " '我们青年志愿者应该多做贡献',\n",
       " '为村里的脱贫致富出把力',\n",
       " '工作队带着我们为村里的老人服务',\n",
       " '看到那些像我爷爷奶奶一样的老人赞许感谢的目光',\n",
       " '我体会到了帮助他人的快乐',\n",
       " '自治区新闻出版广电局驻村工作队孙敏',\n",
       " '艾力依布拉音',\n",
       " '麦收时节',\n",
       " '我们在一起',\n",
       " '6月中旬的和田墨玉',\n",
       " '麦田金黄',\n",
       " '静待收割',\n",
       " '6月14日',\n",
       " '15日两天',\n",
       " '自治区高级人民法院驻和田地区墨玉县吐外特乡罕勒克艾日克村工作队与48名村民志愿者一道',\n",
       " '帮助村里29户有需要的村民进行小麦收割工作',\n",
       " '田间地头',\n",
       " '罕勒克艾日克村志愿队的红旗迎风飘扬',\n",
       " '格外醒目',\n",
       " '10余台割麦机一起轰鸣',\n",
       " '男人们在用机器收割小麦的同时',\n",
       " '几名妇女也加入到志愿队',\n",
       " '构成了一道美丽的麦收风景',\n",
       " '休息空闲',\n",
       " '工作队员和村民们坐在树荫下',\n",
       " '田埂上',\n",
       " '互相问好',\n",
       " '聊天',\n",
       " '语言交流有困难',\n",
       " '就用手势',\n",
       " '动作比划着聊天',\n",
       " '有趣地交流方式不时引来阵阵欢笑',\n",
       " '大家在一同享受丰收和喜悦',\n",
       " '也一同增进着彼此的情感和友谊',\n",
       " '自治区高级人民法院驻村工作队周春梅',\n",
       " '艾地艾木',\n",
       " '阿不拉',\n",
       " '细看稻菽千重浪',\n",
       " '6月15日',\n",
       " '自治区煤田灭火工程局的干部职工们再一次跋涉1000多公里来到了叶城县萨依巴格乡阿亚格欧尔达贝格村',\n",
       " '见到了自己的亲戚',\n",
       " '现场处处都透出掩盖不住的喜悦',\n",
       " '一声声亲切的',\n",
       " '谢谢',\n",
       " '一个个结实的拥抱',\n",
       " '都透露出浓浓的亲情',\n",
       " '没坐一会儿',\n",
       " '在嘘寒问暖中大家了解到在麦收的关键时刻',\n",
       " '部分村民家中却存在收割难的问题',\n",
       " '小麦成熟期短',\n",
       " '收获的时间集中',\n",
       " '天气的变化对小麦最终产量的影响极大',\n",
       " '如果不能及时收割',\n",
       " '会有不小损失的',\n",
       " '于是',\n",
       " '大家几乎立刻就决定要帮助亲戚们收割麦子',\n",
       " '在茂密的麦地里',\n",
       " '干部们每人手持一把镰刀',\n",
       " '一字排开',\n",
       " '挽起衣袖',\n",
       " '卷起裤腿',\n",
       " '挥舞着镰刀进行着无声的竞赛',\n",
       " '骄阳似火',\n",
       " '汗如雨下',\n",
       " '但这都挡不住大家的热情',\n",
       " '随着此起彼伏的镰刀割倒麦子的',\n",
       " '刷刷',\n",
       " '声响',\n",
       " '不一会',\n",
       " '一束束沉甸甸的麦穗就被整齐地堆放了起来',\n",
       " '当看到自己亲手收割的金黄色麦穗',\n",
       " '被一簇簇地打成捆运送到晒场',\n",
       " '每个人的脸上都露出了灿烂的笑容',\n",
       " '自治区煤田灭火工程局驻村工作队马浩南',\n",
       " '这是一个收获多多的季节',\n",
       " '6月13日清晨6时许',\n",
       " '和田地区民丰县若雅乡特开墩村的麦田里已经传来马达轰鸣声',\n",
       " '原来是自治区质监局驻村工作队趁着天气尚且凉爽',\n",
       " '开始了麦田的收割工作',\n",
       " '忙碌间隙',\n",
       " '志愿者队伍搬来清凉的水',\n",
       " '村民们拎来鲜甜的西瓜',\n",
       " '抹一把汗水',\n",
       " '吃一牙西瓜',\n",
       " '甜蜜的汁水似乎流进了每一个人的心里',\n",
       " '说起割麦子',\n",
       " '对于生活在这片土地上的村民来说是再平常不过的事',\n",
       " '但是对于工作队队员们来说却是陌生的',\n",
       " '自治区质监局驻民丰县若克雅乡博斯坦村工作队队员们一开始觉得十几个人一起收割二亩地应该会挺快的',\n",
       " '结果却一点不简单',\n",
       " '镰刀拿到自己手里割起来',\n",
       " '考验才真正的开始',\n",
       " '大家弓着腰',\n",
       " '弯着腿',\n",
       " '亦步亦趋',\n",
       " '手上挥舞着镰刀',\n",
       " '时刻注意不要让镰刀割到自己',\n",
       " '脚下还要留心不要把套种的玉米苗踩伤',\n",
       " '不一会儿',\n",
       " '就已经汗流浃背了',\n",
       " '抬头看看',\n",
       " '身边的村民早就远远地割到前面去了',\n",
       " '只有今年已经56岁的工作队队长李树刚有割麦经验',\n",
       " '多少给队员们挽回了些',\n",
       " '面子',\n",
       " '赶不上村民们割麦子的速度',\n",
       " '更不要说搞定收割机这台大家伙了',\n",
       " '现代化的机械收割',\n",
       " '能成倍提升小麦的收割速度',\n",
       " '李树刚说',\n",
       " '不过',\n",
       " '能有这样的体验',\n",
       " '拉近和村民的距离',\n",
       " '也是很难得的体验',\n",
       " '自治区质监局驻村工作队王辉',\n",
       " '马君刚',\n",
       " '我们是麦田的守护者',\n",
       " '为了应对麦收',\n",
       " '新疆银监局驻和田县塔瓦库勒乡也先巴扎村工作队一早就从经济支援和人力支援两方面做好了准备',\n",
       " '一方面',\n",
       " '工作队帮村里购入了5台小麦收割机',\n",
       " '另一边',\n",
       " '还组织村干部',\n",
       " '青年团员等组成了6支近百人的',\n",
       " '收割先锋突击队',\n",
       " '帮助村民们抢收麦子',\n",
       " '看着及时归仓的麦子',\n",
       " '村民们喜得合不拢嘴',\n",
       " '纷纷摘下自家杏树上的杏子送给工作队',\n",
       " '金黄的麦穗温暖了村民们的心',\n",
       " '香甜的杏子温暖了工作队员的心',\n",
       " '麦子加杏子',\n",
       " '拉近了村民和队员们的心',\n",
       " '新疆银监局驻村工作队王继发',\n",
       " '免责声明',\n",
       " '本文仅代表作者个人观点',\n",
       " '与环球网无关',\n",
       " '其原创性以及文中陈述文字和内容未经本站证实',\n",
       " '对本文以及其中全部或者部分内容',\n",
       " '文字的真实性',\n",
       " '完整性',\n",
       " '及时性本站不作任何保证或承诺',\n",
       " '请读者仅作参考',\n",
       " '并请自行核实相关内容']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example = articles[110]\n",
    "token(example)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "89611"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_articles = [''.join(token(str(a)))for a in articles]\n",
    "len(cleaned_articles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache /tmp/jieba.cache\n",
      "Loading model cost 0.432 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "# 分词生成标记\n",
    "tokens = []\n",
    "for article in cleaned_articles:\n",
    "    tokens += list(jieba.cut(article))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('的', 703716),\n",
       " ('n', 382020),\n",
       " ('在', 263597),\n",
       " ('月', 189330),\n",
       " ('日', 166300),\n",
       " ('新华社', 142462),\n",
       " ('和', 134061),\n",
       " ('年', 123106),\n",
       " ('了', 121938),\n",
       " ('是', 100909)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 统计词频\n",
    "words_count = Counter(tokens)\n",
    "words_count.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAD2CAYAAADcUJy6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3deXxc5X3v8c9Poxkto8WWNJJXWd7wArYBG8waFANJAw00gZQkpFkILyc05LY3TVtomttA0jRNbm9vbxaIKUlcQhMITcpSlpTYgNkMMlu8go0t77ZkWfs++t0/NAZbSNbukc58368XL585c87o92Dp60fPec5zzN0REZHgSUt2ASIiMjoU8CIiAaWAFxEJKAW8iEhAKeBFRAIqPdkFHFNUVORlZWXJLkNEZFzZsGFDtbvHentvzAR8WVkZFRUVyS5DRGRcMbPKvt7TEI2ISEAp4EVEAkoBLyISUEMKeDObbGaXmVnuSBckIiIjY0ABb2YlZrYusX0acB9wIfC0mUX6OGeqme01s6cS//V6lVdEREZHv7NozGwisBqIJnYtBj7n7jvMbBEwE9jWy6nLgb939ztGqlgRERm4gfTg48B1QD2Auz8AVJrZlcBEYHsf550H3Ghmr5jZt3s7wMxWmlmFmVVUVVUNvnoREelTvwHv7vXuXtdjdw7wx0Al0Nd6w48B5cA5wPlmtriXz17l7svcfVksNrQRnK0H6/neE1upbW4f0vkiIkE1pIus7l7r7p8BwnQHeG+ed/cGd48DrwJzh1jjSVUeaeaHa3ew92jLaHy8iMi4NeiAN7M7zOx9iZcTgNo+Dn0iMdsmG/gAsHGINZ5ULDcDgKqGttH4eBGRcWsoSxV8F7jHzBz4rbtvM7MVwEJ3/8Fxx90GrAXagTvdvbcLscMWy1HAi4j0ZsAB7+7liT93Ahf1eG8NsKbHvrXA/OGXeHLv9OAbFfAiIscb93eyZoZD5GWmqwcvItLDuA946O7FH25oTXYZIiJjSmACXj14EZETBSTgMxXwIiI9BCPgc9SDFxHpKRABX5yXQVN7nKa2zmSXIiIyZgQi4DUXXkTkvYIR8JoLLyLyHsEKePXgRUTeoYAXEQmoQAR8QXaEUJrpZicRkeMEIuDT0oyinIh68CIixwlEwIPuZhUR6Sk4AZ+ToVk0IiLHCU7AqwcvInKCQAV8dWM78a6+HhErIpJaAhPwxbmZxLuco3r4togIEKCA11x4EZETDSngEw/TvszMcke6oKFSwIuInGhAAW9mJWa2LrF9GnAfcCHwtJlF+jgnbGYPm9lzZnbDiFXch2MLjh1WwIuIAAMIeDObCKwGooldi4HPufttwNvAzD5O/TKwwd0vBK4d7d6+evAiIicaSA8+DlwH1AO4+wNApZldCUwEtvdxXjlwf2L7GWBZzwPMbKWZVZhZRVVV1SBLP1E0I53sSEgBLyKS0G/Au3u9u9f12J0D/DFQCfQ1LzEK7Ets1wAlvXz2Kndf5u7LYrHYwKvuQ3GubnYSETlmSBdZ3b3W3T8DhIFz+jisEchKbOcM9WsNRvfNTlpwTEQEhhC6ZnaHmb0v8XICUNvHoRuAixLbS4Bdg65ukGK5GbrIKiKSkD6Ec74L3GNmDvzW3beZ2Qpgobv/4LjjVgOPmtnFwEJg/fDLPblYTgbrGqpH+8uIiIwLAw54dy9P/LmTd3vmx95bA6zpsa/SzC5PHPu/3D0+7Gr7EcvNoKG1k9aOOJnh0Gh/ORGRMW1Ux8Xdfb+739/LRdpRUZybCWiqpIgIBGipAnh3Lrye7CQiErCAn16QDcCu6uYkVyIiknyBCvgZhdmkpxlvVzcmuxQRkaQLVMCHQ2mUFmaz43BTsksREUm6QAU8wOxYDjuq1IMXEQlkwO860kRnvCvZpYiIJFXgAn5WLEpH3NlztCXZpYiIJFXgAn52LAeAHYc1TCMiqS2AAd+9
bL3G4UUk1QUu4CdkRyjKifB2lWbSiEhqC1zAA8zSTBoRkWAGvKZKiogENuCjHG3uoKapPdmliIgkTTADvjgxk0a9eBFJYYEM+DmaKikiEsyAnzIhi0h6mnrwIpLSAhnwoTRjVlFUUyVFJKUNKODNrMTM1iW2S83sKTNbY2arzMz6OGeqme1NHPuUmcVGsvD+aCaNiKS6fgPezCbS/QDtaGLXF4Cb3H0FMB1Y1Mepy4G/d/fyxH9VI1HwQM2ORdld00xb56g/ClZEZEwaSA8+DlwH1AO4+9fcfUvivUKguo/zzgNuNLNXzOzbw650kGYX59DlUHlET3cSkdTUb8C7e31vD802s+uATe6+v49THwPKgXOA881scS+fsdLMKsysoqpqZDv4xxYde+uQhmlEJDUN6SKrmc0Cvgr8+UkOe97dG9w9DrwKzO15gLuvcvdl7r4sFhvZIfo5xTmYwZuHGkb0c0VExotBB3xiTP4XwA299eyP84SZTTazbOADwMYh1jgkmeEQZYVRth1UwItIakofwjm3AKXA9xMTaP4OCAEL3f0Hxx13G7AWaAfudPdtw6x10OaV5KoHLyIpa8AB7+7liT//GvjrXg5Z0+P4tcD84RQ3XPMm5fLbzQdp7YiTGQ4lsxQRkVMukDc6HTNvUi5drgutIpKaAh/wANs0TCMiKSjQAV9WGCWSnsa2g/XJLkVE5JQLdMCH0oy5xTls1UwaEUlBgQ546B6m0UwaEUlFgQ/4+ZNyOVTfRm2znu4kIqkl8AF/Wkn3hVYN04hIqgl8wM+flAdoyQIRST2BD/iSvAzyMtPVgxeRlBP4gDcz5k/K400FvIikmMAHPHTPpNl2qAF3T3YpIiKnTEoE/GmTcmlo7WR/XWuySxEROWVSIuDnH1uyQHe0ikgKSYmAXzg5j+xIiEd/fzDZpYiInDIpEfDRjHQ+tnQaD762j8P1GqYRkdSQEgEP8LkLZ9LZ5dzzYmWySxEROSVSJuDLiqJcvqCEn79YSUt7PNnliIiMupQJeIAbL57F0eYOfv3q3mSXIiIy6lIq4M8pm8jiafncvW4nXV2aEy8iwTaggDezEjNbl9guNbOnzGyNma2yxJO3ezknbGYPm9lzZnbDSBY9VGbG5y+aydvVTazddjjZ5YiIjKp+A97MJgKrgWhi1xeAm9x9BTAdWNTHqV8GNrj7hcC1ZpY7AvUO2xWLJhPLzeCXL+9JdikiIqNqID34OHAdUA/g7l9z9y2J9wqB6j7OKwfuT2w/AyzreYCZrTSzCjOrqKqqGkzdQxYOpfGRs6ayduthjjS2nZKvKSKSDP0GvLvXu3tdz/1mdh2wyd3393FqFNiX2K4BSnr57FXuvszdl8VisUGUPTzXnD2Nzi7nwdf6Kl1EZPwb0kVWM5sFfBX485Mc1ghkJbZzhvq1RsO8SbksmprPAxs0m0ZEgmvQoZsYk/8FcENvPfvjbAAuSmwvAXYNurpRdO3SaWw+UM/m/VqfRkSCaSi96luAUuD7idk0l5jZCjO7ucdxq4HbzOxfgIXA+mHWOqKuWjKFcMj4j1fUixeRYBpwwLt7eeLPv3b3ye5envjvaXdf4+4/6HF8JXA58BxwmbuPqdtHJ0YjXDq/hAdf20dHvCvZ5YiIjLhRHRd39/3ufn8/QzlJc83SaVQ3trN2q+bEi0jwjJkLn8lQPi/G1AlZfO0/N7KruinZ5YiIjKiUDvhwKI2ffe4cOuNdXP+v69lf25LskkRERkxKBzzA3JJc7vn8cupbOrj+X9dzuEHrxYtIMKR8wAOcMTWfn91wDgfrWrn53le1EJmIBIICPmHpjAJuv/p0XtpVw0+f35XsckREhk0Bf5xrl07jsgXFfPfxrWw/3JjsckREhkUBfxwz49sfXURWJMRf/Op1OjU/XkTGMQV8D8W5mdx+9Rm8vqeWlfds4MdP72DN1kPUtXQkuzQRkUFJT3YBY9GHF09m0746fvPqPtYkboI6p2wiv/ri
BUmuTERk4BTwvTAzbr1iAbdesYC65g7+35q3+MlzOznc0EpxbmayyxMRGRAN0fQjPzvMtUun4Q5rtmhJAxEZPxTwAzB/Ui7TJmbx5JZDyS5FRGTAFPADYGZctqCEdW9V09I+phbFFBHpkwJ+gC5fWEJbZxfPbu/rEbQiImOLAn6Azp1ZQG5mOk9u1jCNiIwPCvgBCofSKJ9XzO+2HtJaNSIyLijgB+GyBcVUN7bz2t7aZJciItKvAQW8mZWY2brjXi8wswf7OeciM9ueeG7r74Zb6FhQflox6WmmYRoRGRf6DXgzm0j3A7Sjidezge8B+f2cei7w5cRzWy8dbqFjQX52mHNnFvDYxoMaphGRMW8gPfg4cB1Qn3jdAFwzgPPOA75uZq+a2c1DrG/M+diyaeysbtJsGhEZ8/oNeHevP/6h2e5+2N3bBvDZPwXeB5wP3GRmhT0PMLOVZlZhZhVVVVWDqTtprlg0maKcDH6mNeNFZIwbzYusa9y9091bgW1AWc8D3H2Vuy9z92WxWGwUSxk5GekhPrm8lLXbDutB3SIypo1KwJuZAc+aWY6ZlQBnAdtH42slw6eWlxIy499eqEx2KSIifRqRgDezT5rZtcdeu7sD/wS8CjwO/OXxwzzjXXFeJlcunsyvKvbQ2NaZ7HJERHo14IB39/K+Xrv7v7v7Az3e/6W7z3X3s3q+FwSfvaCMhrZOfv3K3mSXIiLSK93oNERnlU5kybR8frR2BxW7apJdjojIeyjgh+H2q88glGZ87Mcv8I2HNtGk4RoRGUOse7g8+ZYtW+YVFRXJLmPQmto6+d4T21j9wi4KoxlctWQKH14ymTOnT6D7WrOIyOgxsw3uvqzX9xTwI2NDZQ0/fvptntpWRXu8iwWT83jgi+cTzdBTEUVk9Jws4DVEM0KWzihg1aeXUfH1y7jtqtPZcqCee9drGqWIJI8CfoTlZYb5zAVlXDSniFXP7KS1Q0+AEpHkUMCPki+vmEN1Yxv3vbwn2aWISIpSwI+S5bMKObesgDuf3kF7Z1eyyxGRFKSAH0U3r5jDgbpW/kM3Q4lIEijgR9HFc4u6b4Z6aju1ze3JLkdEUowCfhSZGV/94Dz217Zy+T8/w+MbDya7JBFJIQr4UXbx3BgPfulCYjkZfPHnG/jCPRXc9/JuNlQepa6lI9nliUiA6UanU6Qj3sWPn97BD9fuoOW4qZNFORnMjkWZPymXP7vsNAqikSRWKSLjzcludNJtlqdIOJTGzSvmclP5HPbUNLP9cCPbqxp5u6qRHVVN3Lt+N+1x5x8+uijZpYpIQCjgT7FQmlFWFKWsKMpllLyz/+v/uZFfvrybm1fMYeqErCRWKCJBoTH4MeKL5bMBuPOpHUmuRESCQgE/RkydkMW1S6dz38t7OFjXmuxyRCQAFPBjyJ+Wz6bLnTufVi9eRIZPY/BjyPSCbD5y1lR+8dJuLphdSH5WmGhGOlMmZGl2jYgM2oAC3sxKgAfc/eLE6wXAd9z96pOcMwF4CAgB33L3x0ag3sD70vvn8MgbB1h5z4YT9hdEI8wpzuGzF5RxxaLJSapORMaTfgPezCYCq4Fo4vVs4HtATj+n3g78BLgHeNLMHvexMul+DCsrivL0X5Wz92gLLe1xGts635lWWVF5lD+99xX+9soF3HjxrGSXKiJj3EB68HHgOuDBxOsG4BrgiX7Oex/wN+4eN7NtQBmw8/gDzGwlsBKgtLR04FUHXHFuJsW5me/Z39oR5yv3v8a3/msLh+pbufVDC0hL02MBRaR3/Qa8u9cD7zxf1N0PH//6JDrdvTGxXQOU0CPg3X0VsAq672QdRN0pKTMc4vufOJuinE3ctW4nv3l1HzkZ6WRF0plbnMMfLp7MJfNiZKSHkl2qiIwBo3mR9fhHGeWgGTsjIpRm3HbV6Zw+JY9Xd9fS3B6nub2TdW9V8dDr+8nNTOfj50znLz4wj8ywgl4klY1mwG8ys2XuXgEsAf5xFL9WSjEz
rjunlOvOeXdYqyPexfM7jvDrV/Zy17qdvPD2EX70yaWUFmYnsVIRSaYR6VWb2SfN7Noeu+8A7jazVUCDu+8bia8lvQuH0rjktBj/8vGzuOvTy9h9pJkrv7+OJzZpiWKRVDWqq0ma2RzgTOBhd2872bFBX03yVNtT08zN//4Kr++t43MXlnHrhxYQSdcomUjQnGw1yVH9iXf37e7+QH/hLiNvekE293/xfD53YRk/fW4X1975PJVHmpJdloicQloPPgU8vvEgf/nA6zS0djIrFuWs6RNZPrOAP1wymeyIbmYWGc9O1oNXwKeIfbUt/OaVvby2p5bX9tRS3dhOflaY65eX8pkLyijJe++8exEZ+xTwcgJ3Z0PlUe5+didPbDpIJD2Nf/30OVw0tyjZpYnIICVtDF7GJjNjWVkBd3xqKWu/Wk5ZYZQb/+1lnt9RnezSRGQEKeBT3IzCKPfeuJzSgmw+/7MKXnz7SLJLEpERoiEaAaCqoY1P3PUiO6ubiEZCpIfSyMlI58aLZ3L98hmEtOaNyJikMXgZkKqGNlY/v4vGtk46u7p482AjL+2qYdHUfL75R2dw5vQJyS5RRHpQwMuQuDuPvHGAbz6ymarGNn762XMon1ec7LJE5Di6yCpDYmZ8eMkUfvcXl1BakM13H9/GWOkQiEj/FPDSr9zMMH926Vw2H6jX2jYi44gCXgbkqiVTmBWL8s///RZdXerFi4wHCngZkPRQGn926Vy2HWrgv35/INnliMgAKOBlwP5w8RTmFufwf598k7h68SJjngJeBiyUZvzPy09jR1UT33lsi0JeZIxTwMug/MHpk/jEuaXctW4nn1/9MnXNHckuSUT6oICXQUlLM/7ho4v4+4+cwXPbq7nqh8/y+711yS5LRHqhgJchuX75DH658jxaO+Jc/cNnuf3hzTS1dSa7LBE5ju5klWGpa+nge09s5d71u5mUl8knzi1l8bR8Fk3NpzAnI9nliQTesJcqMLMS4AF3v9jMwsCvgQLgbnf/SR/nTAXWA9sTuz7m7lV9fQ0F/Pi2ofIotz+ymdf31L6zrygnwvSCbGYUZDMhO0I4ZKSH0phXksuViycTDukXSJHhGlbAm9lE4BdAsbufbWZfAfLc/Rtm9ihwnbs39HLeR4ESd79jIEUq4IOhvrWDTfvq+f2+Wt6uamJ3TTOVR5qpb+2gM+50xLvo7HKmF2Txp+VzuObsaXoYuMgwDDfg8wADHnT3cjN7CLjF3Teb2S3Aendf28t53wUuTZz7uLv/TS/HrARWApSWli6trKwcZNNkvHF3ntxymO+veYs39taRl5nOuTMLOW9WARfNLWJeSS5mWppYZKBOFvD9PnHZ3esTH3JsVxTYl9iuAUr6OPUx4JtAM/CkmS129zd6fPYqYBV09+D7q0XGPzPj8oUlXLagmGfequbRNw7w4s4jPLnlEAAzi6JcsWgSHzlrKnOKc5Ncrcj41m/A96IRyALqgJzE69487+5tAGb2KjAXeKOPYyXFmBmXnBbjktNiAByoa2HN1sM8+vsD3PHUDu56Zif3feE8ziqdmORKRcavoQx+bgAuSmwvAXb1cdwTZjbZzLKBDwAbh/C1JEVMzs/i+uUzuPfG83jh1kspyc/gpp+/QlVDW7JLExm3hhLwq4HbzOxfgIXAejNbYWY39zjuNmAt8CJwp7tvG16pkipK8jL58aeWUdvSzpfufYWOeFeySxIZlwYc8O5envizErgceA64zN3j7r7G3X/Q4/i17j7f3Rf3fE+kPwun5PGP1yzmpV01fOuRzXrQiMgQDGUMHnffD9w/wrWInODqM6fyxt467n52J5H0NG790ALS9PBvkQEbUsCLnCpfu2IBnfEu7lq3kyON7fzjtYt1g5TIACngZUxLSzO+cdXpFOVk8E///SZVjW3ccOFMzp1ZQDRD374iJ6OfEBnzzIwvXzqXwpwMbnt4E+veqiYcMs4qnchlC4r5wMJJlBVFk12myJij
xcZkXGntiFOx6yjPbq/m6Ter2HKgHoBZsSiT8zPJjqSTm5nO2aUTef/8YqZOyEpyxSKja9iLjZ0KCngZij01zTy55RDPvlVNbUsHTW2d1DS1czgxf/60khw+f9FMPrZ0ui7QSiAp4CWluDs7qpp4atthHn59P6/vrWPJtHxuu/oMzpw+IdnliYwoBbykLHfnP1/bx7cf3UpVQxsleRlkhUNkhkMsnpbPH505leWzCgmpdy/jlAJeUl5Dawern9/F3qMttHTEaWjtZP3bR2hqjzMpL5PzZxcyvSCb0oJsFk3N57SSHK1qKePCsFaTFAmC3MwwN6+Ye8K+lvY4T245xEOv7+elnTU8+No+uhL9nZlFUT54+iSWzyqgIDtCQTTCxGiEaCSk4JdxQz14kYT2zi721bbw/I5qHt94kBd2HKGz68Sfj0h6GkXRCPnZETLDaWSmh8jLSmdWLIfZsRxmFnXP5inOzSBdN2TJKaAevMgARNLTmFkUZWZRlOuXz6C2uZ3thxs52tzB0eZ2jja1U9PUTnVjO3Ut7bR2dNHaEWdHVRNrth6mI/7uPwZp1r1C5vvnx7hi0WSWz9Q4v5x6CniRPkzIjrCsrGBAx3bEu9hT08yuI00cqGvlUF0rbx5q5IENe/n5i7vJzwqTm5mOGaSZkZORTn5WmPysMBOyI0zMDjMxO0JmJEQkZIRDacwpzuH0Kfn6h0GGTAEvMgLCoTRmxXKYFcs5YX9zeydrt1bx7PYq2jq6cCDe5TS2dVLf0sFbhxupbe6gtrn9PcNBALmZ6SyfWciK+cVcsWgSE7Ijp6hFEgQagxcZA9ydhrZO2jq66Ih3D/1s3F/PCzuqeXZ7NXtqWgiHup+CddGcImYURZlRkE1JXiZZ4ZBu4kphGoMXGePMjLzMMGS+u29WLIerlkzB3dm0v56HXt/Pw6/v58kth99zflY4RHFeBounTWDJtHyWlRWweGq+gj/FqQcvMo64O9WN7eyuaWJXdTPVjW00t8dpbu9kT00Lb+ytZX9dKwCT8jL5gzMmUT4vxozC7tk9meFQklsgI009eJGAMDNiuRnEcjNYOqP3C8CH61t5dns1j208yC9e2s3Pnt/1znuF0QiT8jOZnJ/JjMIoVy2ZwuJp+ZrbH1DqwYsEWFNbJ2/sreNAXQv7a1vYV9vKwboWDtS1srO6ibbOLk6fksfHzy3lykWTKYjqIu54M+ylCsysBHjA3S82szDwa6AAuNvdf9LHOQM67hgFvMipVd/awYOv7eff1+9my4F60gzOm1XIB0+fxOxYDiV5GRTnZpKbma6x/DFsWAFvZhOBXwDF7n62mX0FyHP3b5jZo8B17t7Qy3kDOu4YBbxIcrg7mw/U89jvD/LoxgO8XdX0nmOyIyGiGemE0wwze2c+f5p1P3UrIz1ENBIiOyOdGQXZLCubyDllBUzRevyjbrgBnwcY8KC7l5vZQ8At7r7ZzG4B1rv72l7O6/c4M1sJrAQoLS1dWllZOZT2icgIcXf2Hu0ezjnU0Mbh+lYaWjtpauukqb2TzrjT5d3HHZvTH3enraOL5vbu47YfbqSpPQ7A/Em5/Mn5M/jIWVPJjuiS32gY1kVWd69PfMixXVFgX2K7Bijp49R+j3P3VcAq6O7B91eLiIwuM2N6QTbTC7KH/Bmd8S62Hmxg/c4afv3KXr72m41857GtvH9eMUU5GRTmRN65szcnI53sSDpZkRCZ4TRyMtIpzs0kkq51fEbCUP5JbQSygDogJ/F6OMeJSICkh9I4Y2o+Z0zN54YLy3hl91FWP1/Jq3uOUtPY/k7vvi9mEMvpnil0bJkGAzIS6/jnZabz6fPLOHfmwJaRSGVDCfgNwEXAA8AS4MVhHiciAWVmLJ1RcMKUztaOOHUtHTS2dQ/pNCbu4G3piFPf0sGBulYO1LVQ3djOsSHkuENb4rxN++p45I0DXLloMrd8aP6wftsIuqEE/GrgUTO7GFgIrDezFcBCd//ByY4b
drUiMu5lJnrifY3t9qe5vZNVz7zNnU/v4IlNBxOLuBnhkHHTJbP57IUzR7Te8WxI8+DNbArdvfMn3L1uuMeBZtGIyOAcqGvh316opLG1E8d561Aj63fWcMOFM/nalQtSZhVOPbJPRAIv3uV885HN/Oz5XXzw9BK+cMlsIqE0wqE0siMh8rPD5ESCN6dfSxWISOCF0oxvXHU60wuy+dZ/beaJTYfec0yadT+O8bxZhZw/u5BZRTlE0rvX3y/MySAnI1iRqB68iATOW4ca2FvbQmfc6Yh3vbP+fm1zBxv31/Hyzpr3zOYxgzmxHJZMn8CS6RM4c9oE5k/OJTzGH72oHryIpJS5JbnMLcnt8/3OeBcb99dzsK6F9rh3P4/3aAuv761l7dbDPLBhLwAZ6WmcXTqRa5ZO44pFk8bdzVrqwYuIHOfY3byv7anltT21/G7LIXYdaSYnI51LTouRGQ6RZpCbGaZ8XozzZxcmtZevi6wiIkPk7ry0s4b7KvZQseso8S6ny52jzd0PXs/LTOfiuTGyI91r7aeHjPysCIXRCAXRCBOyw+Ql7tyNhNJIT0sjFDImZIWJjsCYv4ZoRESGyMxYPquQ5bMKT9jf2hFn3VvVPL7xIC/tOkJnvLuz3Nnl1Da30xHvv/OcFQ4Ry83g0+fP4MaLZ4147Qp4EZEhyAyHuHxhCZcvfO8tW8eesVvT2E5dSwf1rR3Ut3TS2dVFZ9zp7OqitrmDqoY2qhvbiOVmjEqNCngRkRF27Bm7eZnhpNYxtuf/iIjIkCngRUQCSgEvIhJQCngRkYBSwIuIBJQCXkQkoBTwIiIBpYAXEQmoMbMWjZlVAZVDPL0IqB7BcsaLVGx3KrYZUrPdqdhmGHy7Z7h7rLc3xkzAD4eZVfS12E6QpWK7U7HNkJrtTsU2w8i2W0M0IiIBpYAXEQmooAT8qmQXkCSp2O5UbDOkZrtTsc0wgu0OxBi8iIi8V1B68CIi0oMCXkQkoMZ9wJvZ3Wb2gpn9bbJrGU1mlm9mj5nZb83sN2YWSZW2A5hZiZm9mthOiXab2Y/M7MOJ7cC32cwmmtmjZlZhZj9O7AtsuxPf0+sS22Eze9jMnjOzG/raN1jjOuDN7KNAyN3PB2aZ2dxk1zSKrrexFh8AAAIrSURBVAf+j7t/ADgIfJzUaTvA/wayUuXv3MwuBia5+8Op0mbgT4B7E3PAc83srwhou81sIrAaiCZ2fRnY4O4XAteaWW4f+wZlXAc8UA7cn9j+LXBR8koZXe7+I3f/78TLGPApUqTtZrYCaKL7H7ZyAt5uMwsDdwG7zOxqUqDNCUeAM8xsAjAdmElw2x0HrgPqE6/LebetzwDL+tg3KOM94KPAvsR2DfDep98GjJmdD0wE9pACbTezCPB14JbErlT4O/80sBn4LnAu8CWC32aAZ4EZwP8AtgARAtpud69397rjdvX2fT3s7/XxHvCNQFZiO4fx356TMrMC4PvADaRO228BfuTutYnXqdDus4BV7n4Q+Dndvbegtxng74AvuvvtwFbgk6RGu6H37+thf6+P9/9hG3j317YlwK7klTK6Ej3ZXwG3unslqdP2y4AvmdlTwJnAhwl+u7cDsxLby4Aygt9m6P7NdJGZhYDlwHdIjXZD7z/Pw/4ZH9c3OplZHrAO+B3wIeC8Hr/2BIaZ3QR8G3g9seunwFdIgbYfkwj5qwj433niYtpP6P6VPEz3BfWHCHCbAczsXLq/r2cALwDXEPy/66fcvdzMZgCPAk8CFwDnAdN67nP3+KA+fzwHPLxzNfpy4JnEr7QpI1XbnortTsU2Q2q128ym0N1jf+LYP2S97RvUZ473gBcRkd6N9zF4ERHpgwJeRCSgFPAiIgGlgBcRCSgFvIhIQP1/JBTcA+7z4IgAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# log 词频\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "frequiences = [f for w, f in words_count.most_common(100)]\n",
    "x = [i for i in range(100)]\n",
    "plt.plot(x, np.log(frequiences));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0011399627455473877"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 计算频率\n",
    "def prob_1(word):\n",
    "    return words_count[word] / len(tokens)\n",
    "\n",
    "\n",
    "prob_1(\"我们\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['此外自', '自本周', '本周6', '6月', '月12', '12日起', '日起除', '除小米', '小米手机', '手机6']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 生成 2-gram 单词组合\n",
    "tokens_2_gram = [''.join(tokens[i:i + 2]) for i in range(len(tokens[:-2]))]\n",
    "tokens_2_gram[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('n新华社', 74664),\n",
       " ('2017年', 61480),\n",
       " ('外代二线', 61301),\n",
       " ('日n', 52293),\n",
       " ('新华社照片', 50401),\n",
       " ('5月', 37977),\n",
       " ('4月', 34571),\n",
       " ('新华社记者', 30864),\n",
       " ('２０', 27166),\n",
       " ('日在', 27154)]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words_count_2 = Counter(tokens_2_gram)\n",
    "words_count_2.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2.852474416014339e-07, 5.704948832028678e-08)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2-gram 出现的概率\n",
    "\n",
    "def prob_2(word1, word2):\n",
    "    if word1 + word2 in words_count_2:\n",
    "        return words_count_2[word1 + word2] / len(tokens_2_gram)\n",
    "    else:\n",
    "        return 1 / len(tokens_2_gram)\n",
    "\n",
    "\n",
    "prob_2(\"在\", \"吃饭\"), prob_2(\"在\", \"吃水\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1.3018576470426152e-14, 3.254644117606538e-15)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 计算句子出现的概率\n",
    "\n",
    "\n",
    "def get_probablity(sentence):\n",
    "    words = list(jieba.cut(sentence))\n",
    "\n",
    "    sentence_pro = 1\n",
    "\n",
    "    for i, word in enumerate(words[:-1]):\n",
    "        next_ = words[i + 1]\n",
    "\n",
    "        probability = prob_2(word, next_)\n",
    "\n",
    "        sentence_pro *= probability\n",
    "\n",
    "    return sentence_pro\n",
    "\n",
    "\n",
    "get_probablity('小明在喝水'), get_probablity('小明在飞翔')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sentence: 这个篮球看着这个小猫 with Prb: 1.2711249998725007e-28\n",
      "sentence: 一个好看的篮球看见一个篮球 with Prb: 3.0448757946331477e-40\n",
      "sentence: 一个蓝色的小小的篮球坐在这个小小的女人 with Prb: 5.049246141565204e-56\n",
      "sentence: 这个小猫看着一个好看的蓝色的小猫 with Prb: 6.980077866099737e-52\n",
      "sentence: 一个小猫听着一个女人 with Prb: 3.6162430351070537e-32\n"
     ]
    }
   ],
   "source": [
    "# 结合规则生成的句子，获得句子的概率\n",
    "for sen in [\n",
    "        generate(gram=example_grammar, target='sentence') for i in range(5)\n",
    "]:\n",
    "    print('sentence: {} with Prb: {}'.format(sen, get_probablity(sen)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "今天晚上请你吃大餐，我们一起吃日料 is more possible\n",
      "---- 今天晚上请你吃大餐，我们一起吃日料 with probility 1.9877960696419972e-66\n",
      "---- 明天晚上请你吃大餐，我们一起吃苹果 with probility 1.5902368557135977e-66\n",
      "真是一只好看的小猫 is more possible\n",
      "---- 真事一只好看的小猫 with probility 4.6410899733664495e-34\n",
      "---- 真是一只好看的小猫 with probility 8.135199999184005e-27\n",
      "今晚我去吃火锅 is more possible\n",
      "---- 今晚我去吃火锅 with probility 3.4535695372779675e-20\n",
      "---- 今晚火锅去吃我 with probility 5.508208332780837e-28\n",
      "养乐多绿来一杯 is more possible\n",
      "---- 洋葱奶昔来一杯 with probility 1.8567578157408427e-22\n",
      "---- 养乐多绿来一杯 with probility 3.254644117606538e-15\n"
     ]
    }
   ],
   "source": [
    "# 粗略评估模型性能\n",
    "\n",
    "need_compared = [\n",
    "    \"今天晚上请你吃大餐，我们一起吃日料 明天晚上请你吃大餐，我们一起吃苹果\", \"真事一只好看的小猫 真是一只好看的小猫\",\n",
    "    \"今晚我去吃火锅 今晚火锅去吃我\", \"洋葱奶昔来一杯 养乐多绿来一杯\"\n",
    "]\n",
    "\n",
    "for s in need_compared:\n",
    "    s1, s2 = s.split()\n",
    "    p1, p2 = get_probablity(s1), get_probablity(s2)\n",
    "\n",
    "    better = s1 if p1 > p2 else s2\n",
    "\n",
    "    print('{} is more possible'.format(better))\n",
    "    print('-' * 4 + ' {} with probility {}'.format(s1, p1))\n",
    "    print('-' * 4 + ' {} with probility {}'.format(s2, p2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/yangbin7/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3058: DtypeWarning: Columns (0,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>link</th>\n",
       "      <th>name</th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>吴京意淫到了脑残的地步，看了恶心想吐</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>中二得很</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  id                                        link name  \\\n",
       "0  1  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "1  2  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "2  3  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "3  4  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "4  5  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "\n",
       "                                             comment star  \n",
       "0                                 吴京意淫到了脑残的地步，看了恶心想吐    1  \n",
       "1  首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...    2  \n",
       "2  吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...    2  \n",
       "3                      凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。    4  \n",
       "4                                               中二得很    1  "
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 豆瓣影评，重新训练语言模型\n",
    "content = pd.read_csv('datasets/movie_comments.csv', encoding='utf-8-sig')\n",
    "content.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "261497"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "comments = content['comment'].tolist()\n",
    "len(comments)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['吴京意淫到了脑残的地步看了恶心想吐',\n",
       " '首映礼看的太恐怖了这个电影不讲道理的完全就是吴京在实现他这个小粉红的英雄梦各种装备轮番上场视物理逻辑于不顾不得不说有钱真好随意胡闹',\n",
       " '吴京的炒作水平不输冯小刚但小刚至少不会用主旋律来炒作吴京让人看了不舒服为了主旋律而主旋律为了煽情而煽情让人觉得他是个大做作大谎言家729更新片子整体不如湄公河行动1整体不够流畅编剧有毒台词尴尬2刻意做作的主旋律煽情显得如此不合时宜而又多余',\n",
       " '凭良心说好看到不像战狼1的续集完虐湄公河行动',\n",
       " '中二得很']"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def token(string):\n",
    "    return re.sub('[^\\w]', '', string)\n",
    "\n",
    "cleaned_comments = [token(str(s)) for s in comments]\n",
    "cleaned_comments[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['吴京', '意淫', '到', '了', '脑残', '的', '地步', '看', '了', '恶心']"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def cut(string):\n",
    "    return list(jieba.cut(string))\n",
    "\n",
    "\n",
    "TOKEN = [w for s in cleaned_comments for w in cut(s)]\n",
    "TOKEN[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('的', 328262),\n",
       " ('了', 102420),\n",
       " ('是', 73106),\n",
       " ('我', 50338),\n",
       " ('都', 36255),\n",
       " ('很', 34712),\n",
       " ('看', 34022),\n",
       " ('电影', 33675),\n",
       " ('也', 32065),\n",
       " ('和', 31290)]"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words_count = Counter(TOKEN)\n",
    "words_count.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.07310448068987618"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def prob_1(word):\n",
    "    return words_count[word] / len(TOKEN)\n",
    "\n",
    "\n",
    "prob_1('的')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['吴京意淫', '意淫到', '到了', '了脑残', '脑残的', '的地步', '地步看', '看了', '了恶心', '恶心想']"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TOKEN_2_GRAM = [''.join(TOKEN[i:i + 2]) for i in range(len(TOKEN) - 2)]\n",
    "TOKEN_2_GRAM[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('的电影', 8640), ('看的', 7106), ('都是', 6335), ('让人', 5284), ('的故事', 4709)]"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words_count_2 = Counter(TOKEN_2_GRAM)\n",
    "words_count_2.most_common(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0010210874035228295"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def prob_2(word1, word2):\n",
    "    if word1 + word2 in words_count_2:\n",
    "        return words_count_2[word1 + word2] / len(TOKEN_2_GRAM)\n",
    "    else:\n",
    "        return 1 / len(TOKEN_2_GRAM)\n",
    "\n",
    "\n",
    "prob_2('看', '了')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_probability(sentence):\n",
    "    sentence = token(sentence)\n",
    "    words = cut(sentence)\n",
    "    sentence_pro = 1\n",
    "    for i, word in enumerate(words[:-1]):\n",
    "        next_ = words[i + 1]\n",
    "        probability = prob_2(word, next_)\n",
    "        sentence_pro *= probability\n",
    "    return sentence_pro"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sentence: 亲,您好,我是客服小芳,你想咨询购买须知和产品详情吗？ with Prb: 1.715403233391157e-88\n",
      "sentence: 先生,您好,我是718号客服,您想了解退货流程和产品详情和使用方法吗？ with Prb: 3.3552615650727975e-115\n",
      "sentence: 亲,你好,我是375号客服,你想了解购买须知吗？ with Prb: 3.0436992639017126e-73\n",
      "sentence: 帅哥,您好,我是7号客服,你想咨询产品维修和人工客服和产品详情和人工客服和产品维修吗？ with Prb: 9.26562065280927e-154\n",
      "sentence: 美女,你好,我是客服张三,你想知道购买须知吗？ with Prb: 1.8388537547615324e-65\n"
     ]
    }
   ],
   "source": [
    "for _ in range(5):\n",
    "    sen = generate(gram=example_gram, target='ask')\n",
    "    prob = get_probability(sen)\n",
    "    words = cut(token(sen))\n",
    "    print('sentence: {} with Prb: {}'.format(sen, prob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 sentence: 吴京意淫到了脑残的地步看了恶心想吐 with Prb: 1.3855036147011206e-57\n",
      "1 sentence: 恶心想脑残吴京到的吐了地步看了意淫 with Prb: 1.901167206672156e-59\n",
      "2 sentence: 了地步到想吐吴京脑残了恶心看的意淫 with Prb: 1.9704470984291107e-61\n",
      "3 sentence: 地步想脑残到的看吴京了恶心了意淫吐 with Prb: 1.1219598424450702e-60\n",
      "4 sentence: 看了脑残的吴京意淫想地步了吐恶心到 with Prb: 7.746165454131738e-62\n",
      "5 sentence: 恶心脑残的到看地步了了吴京意淫吐想 with Prb: 3.227580020018959e-58\n",
      "6 sentence: 了想到脑残了意淫看地步的吴京恶心吐 with Prb: 2.3766266775518214e-63\n",
      "7 sentence: 了吐到的了看想意淫脑残地步恶心吴京 with Prb: 4.199737024666696e-64\n",
      "8 sentence: 想脑残了到吐意淫了的看吴京地步恶心 with Prb: 1.1560895187117663e-62\n",
      "9 sentence: 到了想意淫了吴京脑残恶心吐的地步看 with Prb: 1.402169399564516e-63\n"
     ]
    }
   ],
   "source": [
    "test_1 = cleaned_comments[0]\n",
    "print('0'+ ' sentence: {} with Prb: {}'.format(test_1, get_probability(test_1)))\n",
    "for i in range(1, 10):\n",
    "    words = cut(test_1)\n",
    "    random.shuffle(words)\n",
    "    test_2 = ''.join(words)\n",
    "    print(str(i) + ' sentence: {} with Prb: {}'.format(test_2, get_probability(test_2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'小朋友,你好,我是3号客服,你想咨询优惠券吗？'"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 从生成的句子中获取概率最高的句子\n",
    "def generate_best(gram, model):\n",
    "    sentences = generate_n(n=20, gram=gram, target='ask')\n",
    "    return max(sentences, key=model)\n",
    "\n",
    "\n",
    "generate_best(example_gram, get_probability)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'小朋友,您好,我是客服小静,你想咨询优惠券吗？'"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def generate_best_2(gram, model):\n",
    "    n = 20\n",
    "    sentence_prob = []\n",
    "    for i in range(n):\n",
    "        sentence = generate(gram=gram, target='ask')\n",
    "        prob = model(sentence)\n",
    "        sentence_prob.append((sentence, prob))\n",
    "    return sorted(sentence_prob, key=lambda x: x[1], reverse=True)[0][0]\n",
    "\n",
    "\n",
    "generate_best_2(example_gram, get_probability)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Problem Solving: Search Based"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Mathematical or Analytic Based"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. Machine Learning (Deep Learning) Based"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
